import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn
sn.set_style("darkgrid")
Importing Datasets
# Load the red and white wine-quality tables; both CSV files use ';' as the
# field separator.
red_wine_df = pd.read_csv('winequality-red.csv', sep=';')
white_wine_df = pd.read_csv('winequality-white.csv', sep=';')
Red Wine Dataset Columns
red_wine_df.columns
Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
'pH', 'sulphates', 'alcohol', 'quality'],
dtype='object')
White Wine Dataset Columns
white_wine_df.columns
Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
'pH', 'sulphates', 'alcohol', 'quality'],
dtype='object')
Showing the first five rows
red_wine_df.head()
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 |
| 1 | 7.8 | 0.88 | 0.00 | 2.6 | 0.098 | 25.0 | 67.0 | 0.9968 | 3.20 | 0.68 | 9.8 | 5 |
| 2 | 7.8 | 0.76 | 0.04 | 2.3 | 0.092 | 15.0 | 54.0 | 0.9970 | 3.26 | 0.65 | 9.8 | 5 |
| 3 | 11.2 | 0.28 | 0.56 | 1.9 | 0.075 | 17.0 | 60.0 | 0.9980 | 3.16 | 0.58 | 9.8 | 6 |
| 4 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 |
Showing the dataset shape
red_wine_df.shape
(1599, 12)
Showing the total information
red_wine_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1599 entries, 0 to 1598 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 fixed acidity 1599 non-null float64 1 volatile acidity 1599 non-null float64 2 citric acid 1599 non-null float64 3 residual sugar 1599 non-null float64 4 chlorides 1599 non-null float64 5 free sulfur dioxide 1599 non-null float64 6 total sulfur dioxide 1599 non-null float64 7 density 1599 non-null float64 8 pH 1599 non-null float64 9 sulphates 1599 non-null float64 10 alcohol 1599 non-null float64 11 quality 1599 non-null int64 dtypes: float64(11), int64(1) memory usage: 150.0 KB
Showing the null values
red_wine_df.isnull().sum()
fixed acidity 0 volatile acidity 0 citric acid 0 residual sugar 0 chlorides 0 free sulfur dioxide 0 total sulfur dioxide 0 density 0 pH 0 sulphates 0 alcohol 0 quality 0 dtype: int64
Showing the duplicates
red_wine_df.duplicated().sum()
240
Descriptive Statistics
red_wine_df.describe()
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 |
| mean | 8.319637 | 0.527821 | 0.270976 | 2.538806 | 0.087467 | 15.874922 | 46.467792 | 0.996747 | 3.311113 | 0.658149 | 10.422983 | 5.636023 |
| std | 1.741096 | 0.179060 | 0.194801 | 1.409928 | 0.047065 | 10.460157 | 32.895324 | 0.001887 | 0.154386 | 0.169507 | 1.065668 | 0.807569 |
| min | 4.600000 | 0.120000 | 0.000000 | 0.900000 | 0.012000 | 1.000000 | 6.000000 | 0.990070 | 2.740000 | 0.330000 | 8.400000 | 3.000000 |
| 25% | 7.100000 | 0.390000 | 0.090000 | 1.900000 | 0.070000 | 7.000000 | 22.000000 | 0.995600 | 3.210000 | 0.550000 | 9.500000 | 5.000000 |
| 50% | 7.900000 | 0.520000 | 0.260000 | 2.200000 | 0.079000 | 14.000000 | 38.000000 | 0.996750 | 3.310000 | 0.620000 | 10.200000 | 6.000000 |
| 75% | 9.200000 | 0.640000 | 0.420000 | 2.600000 | 0.090000 | 21.000000 | 62.000000 | 0.997835 | 3.400000 | 0.730000 | 11.100000 | 6.000000 |
| max | 15.900000 | 1.580000 | 1.000000 | 15.500000 | 0.611000 | 72.000000 | 289.000000 | 1.003690 | 4.010000 | 2.000000 | 14.900000 | 8.000000 |
# Number of red wines per quality score. value_counts() orders by frequency,
# so sort by the score itself to keep the ordinal x-axis in ascending order,
# and request exactly one palette colour per bar.
red_quality_counts = red_wine_df['quality'].value_counts().sort_index()
red_quality_counts.plot(
    kind='bar',
    figsize=(10, 6),
    color=sn.color_palette('viridis', n_colors=len(red_quality_counts)),
)
<AxesSubplot: >
sn.pairplot(red_wine_df, hue='quality')
<seaborn.axisgrid.PairGrid at 0x240c2e5b790>
# Annotated heatmap of the pairwise correlations between red-wine columns.
red_corr_matrix = red_wine_df.corr()
plt.figure(figsize=(12, 8))
sn.heatmap(data=red_corr_matrix, cmap='coolwarm', annot=True, fmt='.2f')
<AxesSubplot: >
red_wine_df.corr()['quality'].plot(kind='bar', figsize=(15,8))
<AxesSubplot: >
# Histogram (with KDE curve) of alcohol content for red wines.
plt.figure(figsize=(10, 6))
# `palette` only takes effect when a `hue` variable is assigned; without one
# seaborn ignores it and emits a UserWarning, so it is not passed.
sn.histplot(red_wine_df['alcohol'], kde=True)
C:\Users\USER\AppData\Local\Temp\ipykernel_7016\2835092100.py:3: UserWarning: Ignoring `palette` because no `hue` variable has been assigned. sn.histplot(red_wine_df['alcohol'], kde=True, palette='mako')
<AxesSubplot: xlabel='alcohol', ylabel='Count'>
Skewness
from scipy.stats import skew
skew(red_wine_df['alcohol'])
0.8600210646566755
Mean
red_wine_df['alcohol'].mean()
10.422983114446529
Median
red_wine_df['alcohol'].median()
10.2
To hide the outliers, we pass showfliers=False here
# Alcohol by quality score for red wines; showfliers=False hides outlier points.
plt.figure(figsize=(10, 6))
# Passing `palette` without `hue` is deprecated (removal announced for seaborn
# v0.14.0); colour by the x variable and suppress the redundant legend instead.
sn.boxplot(x='quality', y='alcohol', hue='quality', data=red_wine_df,
           showfliers=False, palette='dark', legend=False)
C:\Users\USER\AppData\Local\Temp\ipykernel_7016\3473818432.py:3: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sn.boxplot(x='quality', y='alcohol', data=red_wine_df, showfliers=False, palette='dark')
<AxesSubplot: xlabel='quality', ylabel='alcohol'>
# Joint regression plot of pH against alcohol for red wines.
# jointplot() builds its own figure, so a preceding plt.figure() call only
# leaves an empty figure behind; the grid is sized through `height` instead.
sn.jointplot(x='alcohol', y='pH', data=red_wine_df, kind='reg', height=8)
<seaborn.axisgrid.JointGrid at 0x240b49f70a0>
<Figure size 720x576 with 0 Axes>
It's a weak positive correlation.
from scipy.stats import pearsonr
# Pearson correlation between alcohol and pH in the red-wine data; prints the
# coefficient and its p-value.
correlation_coefficient, p_value = pearsonr(red_wine_df['alcohol'], red_wine_df['pH'])
print("Pearson correlation coefficient:", correlation_coefficient)
print("P-value:", p_value)
Pearson correlation coefficient: 0.20563250850549833 P-value: 9.964497741457687e-17
# Joint regression plot of density against alcohol for red wines.
# jointplot() builds its own figure, so a preceding plt.figure() call only
# leaves an empty figure behind; the grid is sized through `height` instead.
sn.jointplot(x='alcohol', y='density', data=red_wine_df, kind='reg', height=8)
<seaborn.axisgrid.JointGrid at 0x240b6e41e70>
<Figure size 720x576 with 0 Axes>
# Pearson correlation between alcohol and density in the red-wine data; prints
# the coefficient and its p-value.
correlation_coefficient, p_value = pearsonr(red_wine_df['alcohol'], red_wine_df['density'])
print("Pearson correlation coefficient:", correlation_coefficient)
print("P-value:", p_value)
Pearson correlation coefficient: -0.4961797702417019 P-value: 3.9388353399870764e-100
# One regression panel per quality score: alcohol (y) against density (x).
g = sn.FacetGrid(red_wine_df, col='quality').map(sn.regplot, 'density', 'alcohol')
# Sulphates by quality score for red wines, outlier points hidden.
plt.figure(figsize=(10, 6))
# `palette` without `hue` is deprecated in seaborn; colour by the x variable
# and suppress the redundant legend.
sn.boxplot(x='quality', y='sulphates', hue='quality', data=red_wine_df,
           showfliers=False, palette='magma', legend=False)
C:\Users\USER\AppData\Local\Temp\ipykernel_7016\1552758158.py:3: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sn.boxplot(x='quality', y='sulphates', data=red_wine_df, showfliers=False, palette='magma')
<AxesSubplot: xlabel='quality', ylabel='sulphates'>
# Total sulfur dioxide by quality score for red wines, outlier points hidden.
plt.figure(figsize=(10, 6))
# `palette` without `hue` is deprecated in seaborn; colour by the x variable
# and suppress the redundant legend.
sn.boxplot(x='quality', y='total sulfur dioxide', hue='quality', data=red_wine_df,
           showfliers=False, palette='colorblind', legend=False)
C:\Users\USER\AppData\Local\Temp\ipykernel_7016\1925034777.py:3: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sn.boxplot(x='quality', y='total sulfur dioxide', data=red_wine_df, showfliers=False, palette='colorblind')
<AxesSubplot: xlabel='quality', ylabel='total sulfur dioxide'>
# Free sulfur dioxide by quality score for red wines, outlier points hidden.
plt.figure(figsize=(10, 6))
# `palette` without `hue` is deprecated in seaborn; colour by the x variable
# and suppress the redundant legend.
sn.boxplot(x='quality', y='free sulfur dioxide', hue='quality', data=red_wine_df,
           showfliers=False, palette='Set3', legend=False)
C:\Users\USER\AppData\Local\Temp\ipykernel_7016\2347094703.py:3: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sn.boxplot(x='quality', y='free sulfur dioxide', data=red_wine_df, showfliers=False, palette='Set3')
<AxesSubplot: xlabel='quality', ylabel='free sulfur dioxide'>
# Fixed acidity by quality score for red wines (outliers shown).
plt.figure(figsize=(10, 6))
# `palette` without `hue` is deprecated in seaborn; colour by the x variable
# and suppress the redundant legend.
sn.boxplot(x='quality', y='fixed acidity', hue='quality', data=red_wine_df,
           palette='Set2', legend=False)
C:\Users\USER\AppData\Local\Temp\ipykernel_7016\900932625.py:3: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sn.boxplot(x='quality', y='fixed acidity', data=red_wine_df, palette='Set2')
<AxesSubplot: xlabel='quality', ylabel='fixed acidity'>
# Citric acid by quality score for red wines (outliers shown).
plt.figure(figsize=(10, 6))
# `palette` without `hue` is deprecated in seaborn; colour by the x variable
# and suppress the redundant legend.
sn.boxplot(x='quality', y='citric acid', hue='quality', data=red_wine_df,
           palette='husl', legend=False)
C:\Users\USER\AppData\Local\Temp\ipykernel_7016\1796001089.py:3: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sn.boxplot(x='quality', y='citric acid', data=red_wine_df, palette='husl')
<AxesSubplot: xlabel='quality', ylabel='citric acid'>
# Volatile acidity by quality score for red wines (outliers shown).
plt.figure(figsize=(10, 6))
# `palette` without `hue` is deprecated in seaborn; colour by the x variable
# and suppress the redundant legend.
sn.boxplot(x='quality', y='volatile acidity', hue='quality', data=red_wine_df,
           palette='rainbow', legend=False)
C:\Users\USER\AppData\Local\Temp\ipykernel_7016\2572236511.py:3: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sn.boxplot(x='quality', y='volatile acidity', data=red_wine_df, palette='rainbow')
<AxesSubplot: xlabel='quality', ylabel='volatile acidity'>
red_wine_df.columns
Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
'pH', 'sulphates', 'alcohol', 'quality'],
dtype='object')
# Pearson correlation between pH and volatile acidity in the red-wine data;
# prints the coefficient and its p-value.
correlation_coefficient, p_value = pearsonr(red_wine_df['pH'], red_wine_df['volatile acidity'])
print("Pearson correlation coefficient:", correlation_coefficient)
print("P-value:", p_value)
Pearson correlation coefficient: 0.23493729440739328 P-value: 1.7189939570061834e-21
# Derived column: total acidity = fixed + citric + volatile acidity.
red_wine_df['total acidity'] = (
    red_wine_df['fixed acidity']
    + red_wine_df['citric acid']
    + red_wine_df['volatile acidity']
)
# Total acidity by quality score for red wines.
plt.figure(figsize=(10, 6))
# `palette` without `hue` is deprecated in seaborn; colour by the x variable
# and suppress the redundant legend.
sn.boxplot(x='quality', y='total acidity', hue='quality', data=red_wine_df,
           palette='mako', legend=False)
C:\Users\USER\AppData\Local\Temp\ipykernel_7016\3485094029.py:3: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sn.boxplot(x='quality', y='total acidity', data=red_wine_df, palette='mako')
<AxesSubplot: xlabel='quality', ylabel='total acidity'>
# Scatter plot with fitted regression line: total acidity (y) against pH (x)
# for red wines, drawn on a fresh 10x6 figure.
plt.figure(figsize=(10, 6))
sn.regplot(data=red_wine_df, x='pH', y='total acidity')
<AxesSubplot: xlabel='pH', ylabel='total acidity'>
# One regression panel per quality score: pH (y) against total acidity (x).
g = sn.FacetGrid(red_wine_df, col='quality').map(sn.regplot, 'total acidity', 'pH')
# Pearson correlation between pH and the derived total acidity column in the
# red-wine data; prints the coefficient and its p-value.
correlation_coefficient, p_value = pearsonr(red_wine_df['pH'], red_wine_df['total acidity'])
print("Pearson correlation coefficient:", correlation_coefficient)
print("P-value:", p_value)
Pearson correlation coefficient: -0.6834838221663891 P-value: 1.442656031677709e-220
Showing the first five rows
white_wine_df.head()
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7.0 | 0.27 | 0.36 | 20.7 | 0.045 | 45.0 | 170.0 | 1.0010 | 3.00 | 0.45 | 8.8 | 6 |
| 1 | 6.3 | 0.30 | 0.34 | 1.6 | 0.049 | 14.0 | 132.0 | 0.9940 | 3.30 | 0.49 | 9.5 | 6 |
| 2 | 8.1 | 0.28 | 0.40 | 6.9 | 0.050 | 30.0 | 97.0 | 0.9951 | 3.26 | 0.44 | 10.1 | 6 |
| 3 | 7.2 | 0.23 | 0.32 | 8.5 | 0.058 | 47.0 | 186.0 | 0.9956 | 3.19 | 0.40 | 9.9 | 6 |
| 4 | 7.2 | 0.23 | 0.32 | 8.5 | 0.058 | 47.0 | 186.0 | 0.9956 | 3.19 | 0.40 | 9.9 | 6 |
Showing the last five rows
white_wine_df.tail()
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4893 | 6.2 | 0.21 | 0.29 | 1.6 | 0.039 | 24.0 | 92.0 | 0.99114 | 3.27 | 0.50 | 11.2 | 6 |
| 4894 | 6.6 | 0.32 | 0.36 | 8.0 | 0.047 | 57.0 | 168.0 | 0.99490 | 3.15 | 0.46 | 9.6 | 5 |
| 4895 | 6.5 | 0.24 | 0.19 | 1.2 | 0.041 | 30.0 | 111.0 | 0.99254 | 2.99 | 0.46 | 9.4 | 6 |
| 4896 | 5.5 | 0.29 | 0.30 | 1.1 | 0.022 | 20.0 | 110.0 | 0.98869 | 3.34 | 0.38 | 12.8 | 7 |
| 4897 | 6.0 | 0.21 | 0.38 | 0.8 | 0.020 | 22.0 | 98.0 | 0.98941 | 3.26 | 0.32 | 11.8 | 6 |
Showing the dataset shape
white_wine_df.shape
(4898, 12)
Showing the total information of the dataset
white_wine_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4898 entries, 0 to 4897 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 fixed acidity 4898 non-null float64 1 volatile acidity 4898 non-null float64 2 citric acid 4898 non-null float64 3 residual sugar 4898 non-null float64 4 chlorides 4898 non-null float64 5 free sulfur dioxide 4898 non-null float64 6 total sulfur dioxide 4898 non-null float64 7 density 4898 non-null float64 8 pH 4898 non-null float64 9 sulphates 4898 non-null float64 10 alcohol 4898 non-null float64 11 quality 4898 non-null int64 dtypes: float64(11), int64(1) memory usage: 459.3 KB
Checking Null Values
white_wine_df.isnull().sum()
fixed acidity 0 volatile acidity 0 citric acid 0 residual sugar 0 chlorides 0 free sulfur dioxide 0 total sulfur dioxide 0 density 0 pH 0 sulphates 0 alcohol 0 quality 0 dtype: int64
Showing the duplicates
white_wine_df.duplicated().sum()
937
Showing the number of unique values in each column
white_wine_df.nunique()
fixed acidity 68 volatile acidity 125 citric acid 87 residual sugar 310 chlorides 160 free sulfur dioxide 132 total sulfur dioxide 251 density 890 pH 103 sulphates 79 alcohol 103 quality 7 dtype: int64
Descriptive Statistics
white_wine_df.describe()
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 |
| mean | 6.854788 | 0.278241 | 0.334192 | 6.391415 | 0.045772 | 35.308085 | 138.360657 | 0.994027 | 3.188267 | 0.489847 | 10.514267 | 5.877909 |
| std | 0.843868 | 0.100795 | 0.121020 | 5.072058 | 0.021848 | 17.007137 | 42.498065 | 0.002991 | 0.151001 | 0.114126 | 1.230621 | 0.885639 |
| min | 3.800000 | 0.080000 | 0.000000 | 0.600000 | 0.009000 | 2.000000 | 9.000000 | 0.987110 | 2.720000 | 0.220000 | 8.000000 | 3.000000 |
| 25% | 6.300000 | 0.210000 | 0.270000 | 1.700000 | 0.036000 | 23.000000 | 108.000000 | 0.991723 | 3.090000 | 0.410000 | 9.500000 | 5.000000 |
| 50% | 6.800000 | 0.260000 | 0.320000 | 5.200000 | 0.043000 | 34.000000 | 134.000000 | 0.993740 | 3.180000 | 0.470000 | 10.400000 | 6.000000 |
| 75% | 7.300000 | 0.320000 | 0.390000 | 9.900000 | 0.050000 | 46.000000 | 167.000000 | 0.996100 | 3.280000 | 0.550000 | 11.400000 | 6.000000 |
| max | 14.200000 | 1.100000 | 1.660000 | 65.800000 | 0.346000 | 289.000000 | 440.000000 | 1.038980 | 3.820000 | 1.080000 | 14.200000 | 9.000000 |
# Number of white wines per quality score. value_counts() orders by frequency,
# so sort by the score itself to keep the ordinal x-axis in ascending order.
# The palette is sized to the number of bars so colours do not repeat (the
# default palette length is shorter than the number of quality levels here).
white_quality_counts = white_wine_df['quality'].value_counts().sort_index()
white_quality_counts.plot(
    kind='bar',
    figsize=(10, 6),
    color=sn.color_palette('magma', n_colors=len(white_quality_counts)),
)
<AxesSubplot: >
sn.pairplot(white_wine_df, hue='quality')
<seaborn.axisgrid.PairGrid at 0x240c2e5bf70>
# Set the default figure size for this and all following plots.
# set_theme() is the preferred interface; sn.set() is only an alias for it.
sn.set_theme(rc={'figure.figsize': (11, 7)})
# Annotated heatmap of the pairwise correlations between white-wine columns.
sn.heatmap(white_wine_df.corr(), annot=True, fmt='.2f', cmap='coolwarm')
<AxesSubplot: >
white_wine_df.corr()['quality'].plot(kind='bar', figsize=(15,8))
<AxesSubplot: >
sn.histplot(white_wine_df['alcohol'], kde=True)
<AxesSubplot: xlabel='alcohol', ylabel='Count'>
Skewness
from scipy.stats import skew
skew(white_wine_df['alcohol'])
0.48719273327634327
The mean
white_wine_df['alcohol'].mean()
10.514267047774602
The median
white_wine_df['alcohol'].median()
10.4
# Alcohol by quality score for white wines.
# `palette` without `hue` is deprecated in seaborn; colour by the x variable
# and suppress the redundant legend.
sn.boxplot(x='quality', y='alcohol', hue='quality', data=white_wine_df,
           palette='Set2', legend=False)
C:\Users\USER\AppData\Local\Temp\ipykernel_7016\908718883.py:1: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sn.boxplot(x='quality', y='alcohol', data=white_wine_df, palette='Set2')
<AxesSubplot: xlabel='quality', ylabel='alcohol'>
# Pearson correlation between alcohol and pH in the white-wine data; prints
# the coefficient and its p-value.
correlation_coefficient, p_value = pearsonr(white_wine_df['alcohol'], white_wine_df['pH'])
print("Pearson correlation coefficient:", correlation_coefficient)
print("P-value:", p_value)
Pearson correlation coefficient: 0.12143209874912966 P-value: 1.4900595881932524e-17
# pH by quality score for white wines.
# `palette` without `hue` is deprecated in seaborn; colour by the x variable
# and suppress the redundant legend.
sn.boxplot(x='quality', y='pH', hue='quality', data=white_wine_df,
           palette='Set3', legend=False)
C:\Users\USER\AppData\Local\Temp\ipykernel_7016\3452518404.py:1: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sn.boxplot(x='quality', y='pH', data=white_wine_df, palette='Set3')
<AxesSubplot: xlabel='quality', ylabel='pH'>
# Joint regression plot of density against alcohol for white wines.
# No `hue` variable is assigned, so a `palette` argument would have no effect
# and is not passed.
joint_plot = sn.jointplot(x='alcohol', y='density', data=white_wine_df, kind='reg')
# Pearson correlation for the same pair; prints the coefficient and p-value.
correlation_coefficient, p_value = pearsonr(white_wine_df['alcohol'], white_wine_df['density'])
print("Pearson correlation coefficient:", correlation_coefficient)
print("P-value:", p_value)
Pearson correlation coefficient: -0.7801376214255598 P-value: 0.0
# One regression panel per quality score: alcohol (y) against pH (x).
g = sn.FacetGrid(white_wine_df, col='quality').map(sn.regplot, 'pH', 'alcohol')
# Sulphates by quality score for white wines.
# `palette` without `hue` is deprecated in seaborn; colour by the x variable
# and suppress the redundant legend.
sn.boxplot(x='quality', y='sulphates', hue='quality', data=white_wine_df,
           palette='mako', legend=False)
C:\Users\USER\AppData\Local\Temp\ipykernel_7016\408880811.py:1: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sn.boxplot(x='quality', y='sulphates', data=white_wine_df, palette='mako')
<AxesSubplot: xlabel='quality', ylabel='sulphates'>
# Total sulfur dioxide by quality score for white wines.
# `palette` without `hue` is deprecated in seaborn; colour by the x variable
# and suppress the redundant legend.
sn.boxplot(x='quality', y='total sulfur dioxide', hue='quality', data=white_wine_df,
           palette='magma', legend=False)
C:\Users\USER\AppData\Local\Temp\ipykernel_7016\3293237333.py:1: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sn.boxplot(x='quality', y='total sulfur dioxide', data=white_wine_df, palette='magma')
<AxesSubplot: xlabel='quality', ylabel='total sulfur dioxide'>
# Pearson correlation between quality and total sulfur dioxide in the
# white-wine data; prints the coefficient and its p-value.
correlation_coefficient, p_value = pearsonr(white_wine_df['quality'], white_wine_df['total sulfur dioxide'])
print("Pearson correlation coefficient:", correlation_coefficient)
print("P-value:", p_value)
Pearson correlation coefficient: -0.17473721759706368 P-value: 6.991898124258417e-35
# Free sulfur dioxide by quality score for white wines.
# `palette` without `hue` is deprecated in seaborn; colour by the x variable
# and suppress the redundant legend.
sn.boxplot(x='quality', y='free sulfur dioxide', hue='quality', data=white_wine_df,
           palette='husl', legend=False)
C:\Users\USER\AppData\Local\Temp\ipykernel_7016\4015704598.py:1: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sn.boxplot(x='quality', y='free sulfur dioxide', data=white_wine_df, palette='husl')
<AxesSubplot: xlabel='quality', ylabel='free sulfur dioxide'>
# Pearson correlation between quality and free sulfur dioxide in the
# white-wine data; prints the coefficient and its p-value.
correlation_coefficient, p_value = pearsonr(white_wine_df['quality'], white_wine_df['free sulfur dioxide'])
print("Pearson correlation coefficient:", correlation_coefficient)
print("P-value:", p_value)
Pearson correlation coefficient: 0.008158067123436157 P-value: 0.5681271459219848
# Volatile acidity by quality score for white wines.
# `palette` without `hue` is deprecated in seaborn; colour by the x variable
# and suppress the redundant legend.
sn.boxplot(x='quality', y='volatile acidity', hue='quality', data=white_wine_df,
           palette='colorblind', legend=False)
C:\Users\USER\AppData\Local\Temp\ipykernel_7016\1497120699.py:1: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sn.boxplot(x='quality', y='volatile acidity', data=white_wine_df, palette='colorblind')
<AxesSubplot: xlabel='quality', ylabel='volatile acidity'>
# Pearson correlation between quality and volatile acidity in the white-wine
# data; prints the coefficient and its p-value.
correlation_coefficient, p_value = pearsonr(white_wine_df['quality'], white_wine_df['volatile acidity'])
print("Pearson correlation coefficient:", correlation_coefficient)
print("P-value:", p_value)
Pearson correlation coefficient: -0.19472296892113533 P-value: 4.673261270702513e-43
joint_plot=sn.jointplot(x='residual sugar', y='density', data=white_wine_df, kind='reg')
# Derived column: total acidity = fixed + citric + volatile acidity.
white_wine_df['total acidity'] = (
    white_wine_df['fixed acidity']
    + white_wine_df['citric acid']
    + white_wine_df['volatile acidity']
)
# Total acidity by quality score for white wines.
# `palette` without `hue` is deprecated in seaborn; colour by the x variable
# and suppress the redundant legend.
sn.boxplot(x='quality', y='total acidity', hue='quality', data=white_wine_df,
           palette='Set1', legend=False)
C:\Users\USER\AppData\Local\Temp\ipykernel_7016\3417362548.py:3: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sn.boxplot(x='quality', y='total acidity', data=white_wine_df, palette='Set1')
<AxesSubplot: xlabel='quality', ylabel='total acidity'>
# Pearson correlation between quality and the derived total acidity column in
# the white-wine data; prints the coefficient and its p-value.
correlation_coefficient, p_value = pearsonr(white_wine_df['quality'], white_wine_df['total acidity'])
print("Pearson correlation coefficient:", correlation_coefficient)
print("P-value:", p_value)
Pearson correlation coefficient: -0.13137720684953472 P-value: 2.6507804041318808e-20
# Citric acid by quality score for white wines.
# `palette` without `hue` is deprecated in seaborn; colour by the x variable
# and suppress the redundant legend.
sn.boxplot(x='quality', y='citric acid', hue='quality', data=white_wine_df,
           palette='Set3', legend=False)
C:\Users\USER\AppData\Local\Temp\ipykernel_7016\529856444.py:1: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sn.boxplot(x='quality', y='citric acid', data=white_wine_df, palette='Set3')
<AxesSubplot: xlabel='quality', ylabel='citric acid'>
# Joint regression plot of citric acid against pH for white wines, followed by
# the Pearson correlation (coefficient and p-value) of the same pair.
joint_plot=sn.jointplot(x='pH', y='citric acid', data=white_wine_df, kind='reg')
correlation_coefficient, p_value = pearsonr(white_wine_df['pH'], white_wine_df['citric acid'])
print("Pearson correlation coefficient:", correlation_coefficient)
print("P-value:", p_value)
Pearson correlation coefficient: -0.16374821140062382 P-value: 8.783728611505257e-31
# Residual sugar by quality score for white wines.
# `palette` without `hue` is deprecated in seaborn; colour by the x variable
# and suppress the redundant legend.
sn.boxplot(x='quality', y='residual sugar', hue='quality', data=white_wine_df,
           palette='Set2', legend=False)
C:\Users\USER\AppData\Local\Temp\ipykernel_7016\2414749223.py:1: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sn.boxplot(x='quality', y='residual sugar', data=white_wine_df, palette='Set2')
<AxesSubplot: xlabel='quality', ylabel='residual sugar'>
# Derived column: ratio of total acidity to residual sugar.
white_wine_df['Crisp Ratio'] = white_wine_df['total acidity'] / white_wine_df['residual sugar']
# Crisp Ratio by quality score for white wines, outlier points hidden.
# `palette` without `hue` is deprecated in seaborn; colour by the x variable
# and suppress the redundant legend.
sn.boxplot(x='quality', y='Crisp Ratio', hue='quality', data=white_wine_df,
           showfliers=False, palette='dark', legend=False)
C:\Users\USER\AppData\Local\Temp\ipykernel_7016\936864896.py:3: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sn.boxplot(x='quality', y='Crisp Ratio', data=white_wine_df, showfliers=False, palette='dark')
<AxesSubplot: xlabel='quality', ylabel='Crisp Ratio'>
red_wine_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1599 entries, 0 to 1598 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 fixed acidity 1599 non-null float64 1 volatile acidity 1599 non-null float64 2 citric acid 1599 non-null float64 3 residual sugar 1599 non-null float64 4 chlorides 1599 non-null float64 5 free sulfur dioxide 1599 non-null float64 6 total sulfur dioxide 1599 non-null float64 7 density 1599 non-null float64 8 pH 1599 non-null float64 9 sulphates 1599 non-null float64 10 alcohol 1599 non-null float64 11 quality 1599 non-null int64 12 total acidity 1599 non-null float64 dtypes: float64(12), int64(1) memory usage: 162.5 KB
white_wine_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4898 entries, 0 to 4897 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 fixed acidity 4898 non-null float64 1 volatile acidity 4898 non-null float64 2 citric acid 4898 non-null float64 3 residual sugar 4898 non-null float64 4 chlorides 4898 non-null float64 5 free sulfur dioxide 4898 non-null float64 6 total sulfur dioxide 4898 non-null float64 7 density 4898 non-null float64 8 pH 4898 non-null float64 9 sulphates 4898 non-null float64 10 alcohol 4898 non-null float64 11 quality 4898 non-null int64 12 total acidity 4898 non-null float64 13 Crisp Ratio 4898 non-null float64 dtypes: float64(13), int64(1) memory usage: 535.8 KB
red_wine_df.describe()
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | total acidity | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 |
| mean | 8.319637 | 0.527821 | 0.270976 | 2.538806 | 0.087467 | 15.874922 | 46.467792 | 0.996747 | 3.311113 | 0.658149 | 10.422983 | 5.636023 | 9.118433 |
| std | 1.741096 | 0.179060 | 0.194801 | 1.409928 | 0.047065 | 10.460157 | 32.895324 | 0.001887 | 0.154386 | 0.169507 | 1.065668 | 0.807569 | 1.832708 |
| min | 4.600000 | 0.120000 | 0.000000 | 0.900000 | 0.012000 | 1.000000 | 6.000000 | 0.990070 | 2.740000 | 0.330000 | 8.400000 | 3.000000 | 5.270000 |
| 25% | 7.100000 | 0.390000 | 0.090000 | 1.900000 | 0.070000 | 7.000000 | 22.000000 | 0.995600 | 3.210000 | 0.550000 | 9.500000 | 5.000000 | 7.827500 |
| 50% | 7.900000 | 0.520000 | 0.260000 | 2.200000 | 0.079000 | 14.000000 | 38.000000 | 0.996750 | 3.310000 | 0.620000 | 10.200000 | 6.000000 | 8.720000 |
| 75% | 9.200000 | 0.640000 | 0.420000 | 2.600000 | 0.090000 | 21.000000 | 62.000000 | 0.997835 | 3.400000 | 0.730000 | 11.100000 | 6.000000 | 10.070000 |
| max | 15.900000 | 1.580000 | 1.000000 | 15.500000 | 0.611000 | 72.000000 | 289.000000 | 1.003690 | 4.010000 | 2.000000 | 14.900000 | 8.000000 | 17.045000 |
white_wine_df.describe()
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | total acidity | Crisp Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 |
| mean | 6.854788 | 0.278241 | 0.334192 | 6.391415 | 0.045772 | 35.308085 | 138.360657 | 0.994027 | 3.188267 | 0.489847 | 10.514267 | 5.877909 | 7.467220 | 2.532658 |
| std | 0.843868 | 0.100795 | 0.121020 | 5.072058 | 0.021848 | 17.007137 | 42.498065 | 0.002991 | 0.151001 | 0.114126 | 1.230621 | 0.885639 | 0.887962 | 2.249911 |
| min | 3.800000 | 0.080000 | 0.000000 | 0.600000 | 0.009000 | 2.000000 | 9.000000 | 0.987110 | 2.720000 | 0.220000 | 8.000000 | 3.000000 | 4.130000 | 0.142325 |
| 25% | 6.300000 | 0.210000 | 0.270000 | 1.700000 | 0.036000 | 23.000000 | 108.000000 | 0.991723 | 3.090000 | 0.410000 | 9.500000 | 5.000000 | 6.890000 | 0.776772 |
| 50% | 6.800000 | 0.260000 | 0.320000 | 5.200000 | 0.043000 | 34.000000 | 134.000000 | 0.993740 | 3.180000 | 0.470000 | 10.400000 | 6.000000 | 7.405000 | 1.384058 |
| 75% | 7.300000 | 0.320000 | 0.390000 | 9.900000 | 0.050000 | 46.000000 | 167.000000 | 0.996100 | 3.280000 | 0.550000 | 11.400000 | 6.000000 | 7.960000 | 4.256250 |
| max | 14.200000 | 1.100000 | 1.660000 | 65.800000 | 0.346000 | 289.000000 | 440.000000 | 1.038980 | 3.820000 | 1.080000 | 14.200000 | 9.000000 | 14.960000 | 15.483333 |
sn.heatmap(red_wine_df.corr(), annot=True, cmap='viridis',fmt='.2f')
<AxesSubplot: >
sn.heatmap(white_wine_df.corr(), annot=True, cmap='viridis', fmt='.2f')
<AxesSubplot: >
# Tag each frame with its wine type, then stack both into a single frame.
red_wine_df['type'] = 'Red'
white_wine_df['type'] = 'White'
# ignore_index=True gives the combined frame a fresh 0..n-1 index; otherwise
# the row labels of both inputs are kept and the low labels occur twice.
# 'Crisp Ratio' exists only on the white-wine frame, so red rows hold NaN there.
wines_df = pd.concat([red_wine_df, white_wine_df], ignore_index=True)
wines_df
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | total acidity | type | Crisp Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.99780 | 3.51 | 0.56 | 9.4 | 5 | 8.10 | Red | NaN |
| 1 | 7.8 | 0.88 | 0.00 | 2.6 | 0.098 | 25.0 | 67.0 | 0.99680 | 3.20 | 0.68 | 9.8 | 5 | 8.68 | Red | NaN |
| 2 | 7.8 | 0.76 | 0.04 | 2.3 | 0.092 | 15.0 | 54.0 | 0.99700 | 3.26 | 0.65 | 9.8 | 5 | 8.60 | Red | NaN |
| 3 | 11.2 | 0.28 | 0.56 | 1.9 | 0.075 | 17.0 | 60.0 | 0.99800 | 3.16 | 0.58 | 9.8 | 6 | 12.04 | Red | NaN |
| 4 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.99780 | 3.51 | 0.56 | 9.4 | 5 | 8.10 | Red | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 4893 | 6.2 | 0.21 | 0.29 | 1.6 | 0.039 | 24.0 | 92.0 | 0.99114 | 3.27 | 0.50 | 11.2 | 6 | 6.70 | White | 4.187500 |
| 4894 | 6.6 | 0.32 | 0.36 | 8.0 | 0.047 | 57.0 | 168.0 | 0.99490 | 3.15 | 0.46 | 9.6 | 5 | 7.28 | White | 0.910000 |
| 4895 | 6.5 | 0.24 | 0.19 | 1.2 | 0.041 | 30.0 | 111.0 | 0.99254 | 2.99 | 0.46 | 9.4 | 6 | 6.93 | White | 5.775000 |
| 4896 | 5.5 | 0.29 | 0.30 | 1.1 | 0.022 | 20.0 | 110.0 | 0.98869 | 3.34 | 0.38 | 12.8 | 7 | 6.09 | White | 5.536364 |
| 4897 | 6.0 | 0.21 | 0.38 | 0.8 | 0.020 | 22.0 | 98.0 | 0.98941 | 3.26 | 0.32 | 11.8 | 6 | 6.59 | White | 8.237500 |
6497 rows × 15 columns
wines_df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 6497 entries, 0 to 4897 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 fixed acidity 6497 non-null float64 1 volatile acidity 6497 non-null float64 2 citric acid 6497 non-null float64 3 residual sugar 6497 non-null float64 4 chlorides 6497 non-null float64 5 free sulfur dioxide 6497 non-null float64 6 total sulfur dioxide 6497 non-null float64 7 density 6497 non-null float64 8 pH 6497 non-null float64 9 sulphates 6497 non-null float64 10 alcohol 6497 non-null float64 11 quality 6497 non-null int64 12 total acidity 6497 non-null float64 13 type 6497 non-null object 14 Crisp Ratio 4898 non-null float64 dtypes: float64(13), int64(1), object(1) memory usage: 812.1+ KB
sn.countplot(x='quality', hue='type', data=wines_df)
<AxesSubplot: xlabel='quality', ylabel='count'>
# Overlaid density estimates of the quality score for red vs. white wines.
# `fill` replaces the deprecated `shade` keyword; legend labels spell "wine".
p1 = sn.kdeplot(red_wine_df['quality'], fill=True, color='r', label='red wine')
p2 = sn.kdeplot(white_wine_df['quality'], fill=True, color='b', label='white wine')
plt.legend()
plt.show()
C:\Users\USER\AppData\Local\Temp\ipykernel_7016\608723478.py:1: FutureWarning: `shade` is now deprecated in favor of `fill`; setting `fill=True`. This will become an error in seaborn v0.14.0; please update your code. p1=sn.kdeplot(red_wine_df['quality'], shade=True, color='r', label='red whine') C:\Users\USER\AppData\Local\Temp\ipykernel_7016\608723478.py:2: FutureWarning: `shade` is now deprecated in favor of `fill`; setting `fill=True`. This will become an error in seaborn v0.14.0; please update your code. p2=sn.kdeplot(white_wine_df['quality'], shade=True, color='b', label='white whine')
sn.boxplot(x='quality', y='alcohol', hue='type', data=wines_df, palette=['r','w'])
<AxesSubplot: xlabel='quality', ylabel='alcohol'>
sn.boxplot(x='quality', y='density', hue='type', data=wines_df, palette=['r','w'], showfliers=False)
<AxesSubplot: xlabel='quality', ylabel='density'>
sn.jointplot(x='alcohol', y='residual sugar', data=wines_df, hue='type')
<seaborn.axisgrid.JointGrid at 0x240be27df90>
sn.boxplot(x='quality', y='residual sugar', hue='type', data=wines_df, palette=['r','w'], showfliers=False)
<AxesSubplot: xlabel='quality', ylabel='residual sugar'>
# Overlaid density estimates of residual sugar for red vs. white wines.
# `fill` replaces the deprecated `shade` keyword; legend labels spell "wine".
p1 = sn.kdeplot(red_wine_df['residual sugar'], fill=True, color='r', label='red wine')
p2 = sn.kdeplot(white_wine_df['residual sugar'], fill=True, color='b', label='white wine')
plt.legend()
plt.show()
C:\Users\USER\AppData\Local\Temp\ipykernel_7016\227625680.py:1: FutureWarning: `shade` is now deprecated in favor of `fill`; setting `fill=True`. This will become an error in seaborn v0.14.0; please update your code. p1=sn.kdeplot(red_wine_df['residual sugar'], shade=True, color='r', label='red whine') C:\Users\USER\AppData\Local\Temp\ipykernel_7016\227625680.py:2: FutureWarning: `shade` is now deprecated in favor of `fill`; setting `fill=True`. This will become an error in seaborn v0.14.0; please update your code. p2=sn.kdeplot(white_wine_df['residual sugar'], shade=True, color='b', label='white whine')
sn.regplot(x='alcohol', y='residual sugar', data=wines_df)
<AxesSubplot: xlabel='alcohol', ylabel='residual sugar'>
sn.boxplot(x='quality', y='total sulfur dioxide', hue='type', data=wines_df, palette=['r','w'])
<AxesSubplot: xlabel='quality', ylabel='total sulfur dioxide'>
sn.boxplot(x='quality', y='free sulfur dioxide', hue='type', data=wines_df, palette=['r','w'], showfliers=False)
<AxesSubplot: xlabel='quality', ylabel='free sulfur dioxide'>
sn.boxplot(x='quality', y='sulphates', hue='type', data=wines_df, palette=['r','w'],showfliers=False)
<AxesSubplot: xlabel='quality', ylabel='sulphates'>
sn.boxplot(x='quality', y='citric acid', hue='type', data=wines_df, palette=['r','w'],showfliers=False)
<AxesSubplot: xlabel='quality', ylabel='citric acid'>
sn.boxplot(x='quality', y='chlorides', hue='type', data=wines_df, palette=['r','w'],showfliers=False)
<AxesSubplot: xlabel='quality', ylabel='chlorides'>
# Total acidity on the combined frame (fixed + volatile + citric acid), then
# compared across quality scores for each wine type with outliers hidden.
fixed_acid, volatile_acid, citric_acid = (
    wines_df[column]
    for column in ('fixed acidity', 'volatile acidity', 'citric acid')
)
wines_df['total acidity'] = fixed_acid + volatile_acid + citric_acid
sn.boxplot(
    data=wines_df,
    x='quality',
    y='total acidity',
    hue='type',
    palette=['r', 'w'],
    showfliers=False,
)
<AxesSubplot: xlabel='quality', ylabel='total acidity'>